In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from datetime import datetime
In [15]:
df = pd.read_csv("/Users/poojareddykolimi/Downloads/US_Accidents_March23.csv")
In [3]:
df.head()
Out[3]:
| ID | Source | Severity | Start_Time | End_Time | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | ... | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | Sunrise_Sunset | Civil_Twilight | Nautical_Twilight | Astronomical_Twilight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A-1 | Source2 | 3 | 2016-02-08 05:46:00 | 2016-02-08 11:00:00 | 39.865147 | -84.058723 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 1 | A-2 | Source2 | 2 | 2016-02-08 06:07:59 | 2016-02-08 06:37:59 | 39.928059 | -82.831184 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Day |
| 2 | A-3 | Source2 | 2 | 2016-02-08 06:49:27 | 2016-02-08 07:19:27 | 39.063148 | -84.032608 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Night | Night | Day | Day |
| 3 | A-4 | Source2 | 3 | 2016-02-08 07:23:34 | 2016-02-08 07:53:34 | 39.747753 | -84.205582 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Day | Day | Day |
| 4 | A-5 | Source2 | 2 | 2016-02-08 07:39:07 | 2016-02-08 08:09:07 | 39.627781 | -84.188354 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Day | Day | Day | Day |
5 rows × 46 columns
Data Preprocessing¶
In [17]:
print("Shape:", df.shape)
print("Columns:", df.columns)
print(df.dtypes)
Shape: (7728394, 46)
Columns: Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
'Astronomical_Twilight'],
dtype='object')
ID object
Source object
Severity int64
Start_Time object
End_Time object
Start_Lat float64
Start_Lng float64
End_Lat float64
End_Lng float64
Distance(mi) float64
Description object
Street object
City object
County object
State object
Zipcode object
Country object
Timezone object
Airport_Code object
Weather_Timestamp object
Temperature(F) float64
Wind_Chill(F) float64
Humidity(%) float64
Pressure(in) float64
Visibility(mi) float64
Wind_Direction object
Wind_Speed(mph) float64
Precipitation(in) float64
Weather_Condition object
Amenity bool
Bump bool
Crossing bool
Give_Way bool
Junction bool
No_Exit bool
Railway bool
Roundabout bool
Station bool
Stop bool
Traffic_Calming bool
Traffic_Signal bool
Turning_Loop bool
Sunrise_Sunset object
Civil_Twilight object
Nautical_Twilight object
Astronomical_Twilight object
dtype: object
In [18]:
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing, 'Percent': missing_percent})
print(missing_df[missing_df['Missing Values'] > 0].sort_values(by='Percent', ascending=False))
Missing Values Percent End_Lat 3402762 44.029355 End_Lng 3402762 44.029355 Precipitation(in) 2203586 28.512858 Wind_Chill(F) 1999019 25.865904 Wind_Speed(mph) 571233 7.391355 Visibility(mi) 177098 2.291524 Wind_Direction 175206 2.267043 Humidity(%) 174144 2.253301 Weather_Condition 173459 2.244438 Temperature(F) 163853 2.120143 Pressure(in) 140679 1.820288 Weather_Timestamp 120228 1.555666 Sunrise_Sunset 23246 0.300787 Civil_Twilight 23246 0.300787 Nautical_Twilight 23246 0.300787 Astronomical_Twilight 23246 0.300787 Airport_Code 22635 0.292881 Street 10869 0.140637 Timezone 7808 0.101030 Zipcode 1915 0.024779 City 253 0.003274 Description 5 0.000065
In [19]:
columns_to_drop = [
'ID', 'Source', 'Description', 'Street',
'End_Lat', 'End_Lng',
'Wind_Chill(F)', 'Wind_Direction', 'Airport_Code',
'Amenity', 'Turning_Loop',
'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'
]
df.drop(columns=columns_to_drop, inplace=True)
In [20]:
df.columns
Out[20]:
Index(['Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
'Distance(mi)', 'City', 'County', 'State', 'Zipcode', 'Country',
'Timezone', 'Weather_Timestamp', 'Temperature(F)', 'Humidity(%)',
'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
'Precipitation(in)', 'Weather_Condition', 'Bump', 'Crossing',
'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Sunrise_Sunset'],
dtype='object')
In [22]:
base_score = df['Severity'] * 10
env_score = (
df['Temperature(F)'].apply(lambda x: 5 if (x < 32 or x > 100) else 0) +
df['Humidity(%)'].apply(lambda x: 5 if x > 90 else 0) +
df['Visibility(mi)'].apply(lambda x: 10 if x < 1 else (5 if x < 3 else 0)) +
df['Wind_Speed(mph)'].apply(lambda x: 5 if x > 30 else 0) +
df['Precipitation(in)'].apply(lambda x: 10 if x > 0.1 else 0)
)
infra_score = (
df[['Junction', 'Crossing', 'Railway', 'Roundabout', 'Stop']].sum(axis=1) * 5
- df[['Traffic_Calming', 'Traffic_Signal']].sum(axis=1) * 2
)
df['Hour'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce').dt.hour
time_score = df['Hour'].apply(lambda h: 5 if h < 6 or h > 20 else 0)
df['Risk_Score'] = base_score + env_score + infra_score + time_score
df['Risk_Score'] = base_score + env_score + infra_score + time_score
df['Risk_Score'] = df['Risk_Score'] / df['Risk_Score'].max() * 100
Exploratory Data Analysis¶
In [7]:
df['State'].value_counts().head(10).plot(kind='bar', title='Top 10 States by Accident Count')
plt.show()
df['City'].value_counts().head(10).plot(kind='bar', title='Top 10 Cities by Accident Count')
plt.show()
In [8]:
import folium
from folium.plugins import HeatMap
from IPython.display import display
heat_df = df[['Start_Lat', 'Start_Lng']].dropna().sample(n=10000)
map_center = [heat_df['Start_Lat'].mean(), heat_df['Start_Lng'].mean()]
base_map = folium.Map(location=map_center, zoom_start=5)
HeatMap(heat_df.values.tolist()).add_to(base_map)
display(base_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [9]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
In [10]:
df['Hour'] = df['Start_Time'].dt.hour
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.countplot(x='Hour', data=df, palette='viridis')
plt.title('Accidents by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Accidents')
plt.tight_layout()
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1026542018.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Hour', data=df, palette='viridis')
In [11]:
df['Day'] = df['Start_Time'].dt.day_name()
plt.figure(figsize=(10, 6))
sns.countplot(x='Day', data=df, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
plt.title('Accidents by Day of Week')
Out[11]:
Text(0.5, 1.0, 'Accidents by Day of Week')
In [12]:
env_features = ['Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']
print(df[env_features].describe())
plt.figure(figsize=(10, 8))
sns.heatmap(df[['Severity', 'Temperature(F)', 'Humidity(%)', 'Visibility(mi)',
'Wind_Speed(mph)', 'Precipitation(in)']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
Temperature(F) Humidity(%) Visibility(mi) Wind_Speed(mph) \
count 7.564541e+06 7.554250e+06 7.551296e+06 7.157161e+06
mean 6.166329e+01 6.483104e+01 9.090376e+00 7.685490e+00
std 1.901365e+01 2.282097e+01 2.688316e+00 5.424983e+00
min -8.900000e+01 1.000000e+00 0.000000e+00 0.000000e+00
25% 4.900000e+01 4.800000e+01 1.000000e+01 4.600000e+00
50% 6.400000e+01 6.700000e+01 1.000000e+01 7.000000e+00
75% 7.600000e+01 8.400000e+01 1.000000e+01 1.040000e+01
max 2.070000e+02 1.000000e+02 1.400000e+02 1.087000e+03
Precipitation(in)
count 5.524808e+06
mean 8.407210e-03
std 1.102246e-01
min 0.000000e+00
25% 0.000000e+00
50% 0.000000e+00
75% 0.000000e+00
max 3.647000e+01
Out[12]:
Text(0.5, 1.0, 'Correlation Matrix')
California Analysis¶
In [23]:
# Extract California data
df_ca = df[df['State'] == 'CA'].copy()
print(f"Number of accidents in California: {len(df_ca)}")
Number of accidents in California: 1741433
In [24]:
# Basic structure
print(df_ca.info())
# Missing values
missing_ca = df_ca.isnull().sum()
missing_percent = (missing_ca / len(df_ca)) * 100
missing_df_ca = pd.DataFrame({'Missing Values': missing_ca, 'Percent': missing_percent})
print(missing_df_ca[missing_df_ca['Missing Values'] > 0].sort_values(by='Percent', ascending=False))
<class 'pandas.core.frame.DataFrame'>
Index: 1741433 entries, 728 to 7728393
Data columns (total 34 columns):
# Column Dtype
--- ------ -----
0 Severity int64
1 Start_Time object
2 End_Time object
3 Start_Lat float64
4 Start_Lng float64
5 Distance(mi) float64
6 City object
7 County object
8 State object
9 Zipcode object
10 Country object
11 Timezone object
12 Weather_Timestamp object
13 Temperature(F) float64
14 Humidity(%) float64
15 Pressure(in) float64
16 Visibility(mi) float64
17 Wind_Speed(mph) float64
18 Precipitation(in) float64
19 Weather_Condition object
20 Bump bool
21 Crossing bool
22 Give_Way bool
23 Junction bool
24 No_Exit bool
25 Railway bool
26 Roundabout bool
27 Station bool
28 Stop bool
29 Traffic_Calming bool
30 Traffic_Signal bool
31 Sunrise_Sunset object
32 Hour int32
33 Risk_Score float64
dtypes: bool(11), float64(10), int32(1), int64(1), object(11)
memory usage: 330.5+ MB
None
Missing Values Percent
Precipitation(in) 566204 32.513683
Wind_Speed(mph) 162891 9.353848
Humidity(%) 48341 2.775932
Temperature(F) 45969 2.639723
Visibility(mi) 40125 2.304137
Weather_Condition 39778 2.284211
Pressure(in) 37126 2.131922
Weather_Timestamp 32805 1.883793
Sunrise_Sunset 1343 0.077120
Zipcode 597 0.034282
Timezone 597 0.034282
City 11 0.000632
In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(data=df_ca, x='Severity', palette='rocket')
plt.title('Accident Severity Distribution in California')
plt.xlabel('Severity Level')
plt.ylabel('Number of Accidents')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/3786709651.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_ca, x='Severity', palette='rocket')
In [25]:
df_ca['Start_Time'] = pd.to_datetime(df_ca['Start_Time'], errors='coerce')
# Hour of Day
df_ca['Hour'] = df_ca['Start_Time'].dt.hour
sns.countplot(x='Hour', data=df_ca, palette='viridis')
plt.title('Accidents by Hour (California)')
plt.xlabel('Hour of Day')
plt.ylabel('Accident Count')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/4055911724.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Hour', data=df_ca, palette='viridis')
In [12]:
df_ca['Day'] = df_ca['Start_Time'].dt.day_name()
sns.countplot(x='Day', data=df_ca, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], palette='pastel')
plt.title('Accidents by Day of the Week (California)')
plt.xticks(rotation=45)
plt.ylabel('Accident Count')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/620810245.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='Day', data=df_ca, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], palette='pastel')
In [18]:
top_weather_ca = df_ca['Weather_Condition'].value_counts().nlargest(10).index
sns.countplot(y='Weather_Condition', data=df_ca[df_ca['Weather_Condition'].isin(top_weather_ca)],
order=top_weather_ca, palette='cool')
plt.title('Top Weather Conditions in CA Accidents')
plt.xlabel('Accident Count')
plt.ylabel('Weather Condition')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1688842507.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.countplot(y='Weather_Condition', data=df_ca[df_ca['Weather_Condition'].isin(top_weather_ca)],
In [19]:
# Top 15 cities in CA
top_cities = df_ca['City'].value_counts().nlargest(15)
top_cities.plot(kind='barh', figsize=(10,6), color='skyblue')
plt.title('Top 15 Cities by Number of Accidents in CA')
plt.xlabel('Accident Count')
plt.gca().invert_yaxis()
plt.show()
In [26]:
# env_cols = ['Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Pressure(in)']
# print(df_ca[env_cols].describe())
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Convert Start_Time if not already
df_ca['Start_Time'] = pd.to_datetime(df_ca['Start_Time'], errors='coerce')
# Extract hour and define a new column
df_ca['Hour'] = df_ca['Start_Time'].dt.hour
df_ca['Sunset_Surge'] = df_ca['Hour'].apply(lambda x: 'After Sunset (6-9PM)' if 18 <= x <= 21 else
('Late Night' if 22 <= x <= 4 else
'Rest of Day'))
# Countplot
sns.countplot(data=df_ca, x='Sunset_Surge', order=['After Sunset (6-9PM)', 'Late Night', 'Rest of Day'], palette='autumn')
plt.title('Accident Frequency During and After Sunset in CA')
plt.ylabel('Number of Accidents')
plt.xlabel('Time Category')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/4063657266.py:17: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=df_ca, x='Sunset_Surge', order=['After Sunset (6-9PM)', 'Late Night', 'Rest of Day'], palette='autumn')
In [ ]:
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
# Filter for Los Angeles
df_la = df[(df['State'] == 'CA') & (df['City'].str.lower() == 'los angeles')]
coords = df_la[['Start_Lat', 'Start_Lng']].dropna().to_numpy()
# Optional: Sample for performance
coords_sample = coords if len(coords) < 10000 else coords[:10000]
# Haversine DBSCAN requires radians
from sklearn.preprocessing import StandardScaler
kms_per_radian = 6371.0088
epsilon = 1 / kms_per_radian # 1 km radius
db = DBSCAN(eps=epsilon, min_samples=20, algorithm='ball_tree', metric='haversine').fit(np.radians(coords_sample))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(f"Number of clusters (hotspots) in LA: {num_clusters}")
df_clusters = pd.DataFrame(coords_sample, columns=['lat', 'lng'])
df_clusters['cluster'] = cluster_labels
# Plot clusters
plt.figure(figsize=(10, 8))
colors = plt.cm.get_cmap('tab10', num_clusters)
for cluster_num in range(num_clusters):
cluster = df_clusters[df_clusters.cluster == cluster_num]
plt.scatter(cluster['lng'], cluster['lat'], s=10, label=f'Cluster {cluster_num}', alpha=0.6)
# Noise points (label -1)
noise = df_clusters[df_clusters.cluster == -1]
plt.scatter(noise['lng'], noise['lat'], s=5, c='gray', alpha=0.3, label='Noise')
plt.title('Accident Hotspots in Los Angeles (DBSCAN Clusters)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Number of clusters (hotspots) in LA: 9
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1774305026.py:28: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
colors = plt.cm.get_cmap('tab10', num_clusters)
In [61]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN
from IPython.display import display
# Filter for LA accidents
df_ca = df[(df['State'] == 'CA')]
coords = df_ca[['Start_Lat', 'Start_Lng']].dropna().sample(n=3000, random_state=42)
# DBSCAN clustering
kms_per_radian = 6371.0088
epsilon = 1.0 / kms_per_radian # ~1km
db = DBSCAN(eps=epsilon, min_samples=20, algorithm='ball_tree', metric='haversine')
cluster_labels = db.fit_predict(np.radians(coords[['Start_Lat', 'Start_Lng']]))
coords['Cluster'] = cluster_labels
# Initialize map
la_center = [coords['Start_Lat'].mean(), coords['Start_Lng'].mean()]
folium_map = folium.Map(location=la_center, zoom_start=11, tiles='CartoDB positron')
marker_cluster = MarkerCluster().add_to(folium_map)
colors = ['red', 'blue', 'green', 'orange', 'purple', 'pink', 'brown', 'gray', 'cadetblue', 'darkred']
for idx, row in coords.iterrows():
cluster_id = row['Cluster']
color = colors[int(cluster_id) % len(colors)] if cluster_id != -1 else 'lightgray'
folium.CircleMarker(
location=(row['Start_Lat'], row['Start_Lng']),
radius=3,
color=color,
fill=True,
fill_opacity=0.7,
popup=f"Cluster {int(cluster_id)}" if cluster_id != -1 else "Noise"
).add_to(marker_cluster)
display(folium_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: